{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer\n",
    "from tokenizers.processors import BertProcessing\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = ByteLevelBPETokenizer(lowercase=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/filtered-dumping-academia.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/filtered-dumping-wiki.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-iium.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-news.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-parliament.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-watpadd.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-speech-model/dataset/bahasa-asr-train.json\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-speech-model/dataset/bahasa-asr-test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['inilah dunia kecil',\n",
       " 'itu tadi menarik ya',\n",
       " 'pelayan restoran ini kelihatannya sibuk',\n",
       " 'apakah anda pernah makan serangga',\n",
       " 'aku pernah ingin menjadi seorang astronom fisika',\n",
       " 'saya sedang mencari dompet hitam seperti ini',\n",
       " 'apakah anda percaya pada tuhan',\n",
       " 'semuanya sedang menunggu guru di kelas',\n",
       " 'anak-anak kecil suka dengan buku tentang dinosaurus dan monster',\n",
       " 'ini yang dilakukan mary untuk hidup']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "with open('bahasa-asr-train.json') as fopen:\n",
    "    data = json.load(fopen)['Y']\n",
    "    \n",
    "with open('bahasa-asr-test.json') as fopen:\n",
    "    data.extend(json.load(fopen)['Y'])\n",
    "    \n",
    "data = list(filter(None, data))\n",
    "data[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('asr-text.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['asr-text.txt', 'filtered-dumping-academia.txt', 'filtered-dumping-wiki.txt',\n",
    "        'dumping-iium.txt', 'dumping-news.txt', 'dumping-parliament.txt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "source": [
    "!cat {' '.join(files)} >> train-v3.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "183877906 train-v3.txt\n"
     ]
    }
   ],
   "source": [
    "!wc -w train-v3.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer.train(files=files, vocab_size=32000, min_frequency=2,\n",
    "                show_progress=True,\n",
    "                special_tokens=[\n",
    "                                \"<s>\",\n",
    "                                \"<pad>\",\n",
    "                                \"</s>\",\n",
    "                                \"<unk>\",\n",
    "                                \"<mask>\",\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "mkdir: cannot create directory ‘roberta_bpe’: File exists\n"
     ]
    }
   ],
   "source": [
    "!mkdir roberta_bpe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['roberta_bpe/vocab.json', 'roberta_bpe/merges.txt']"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_model('roberta_bpe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "import transformers\n",
    "from transformers import RobertaTokenizer, AutoModelForMaskedLM, RobertaConfig, RobertaForMaskedLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
      "The tokenizer class you load from this checkpoint is 'BertTokenizer'. \n",
      "The class this function is called from is 'RobertaTokenizer'.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = RobertaTokenizer.from_pretrained('./roberta_bpe', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('malay-cased-roberta-base/tokenizer_config.json',\n",
       " 'malay-cased-roberta-base/special_tokens_map.json',\n",
       " 'malay-cased-roberta-base/vocab.json',\n",
       " 'malay-cased-roberta-base/merges.txt',\n",
       " 'malay-cased-roberta-base/added_tokens.json')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained('malay-cased-roberta-base')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreTrainedTokenizer(name_or_path='./roberta_bpe', vocab_size=32000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken(\"<pad>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken(\"<mask>\", rstrip=False, lstrip=True, single_word=False, normalized=True)})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
